{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OPIK: Existing Opik clients will not use updated values for \"url\", \"api_key\", \"workspace\".\n",
      "OPIK: Opik is already configured. You can check the settings by viewing the config file at /Users/akshay/.opik.config\n"
     ]
    }
   ],
   "source": [
    "import opik\n",
    "opik.configure(use_local=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dotenv import load_dotenv\n",
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup Workflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from llama_index.llms.ollama import Ollama\n",
    "\n",
    "def load_llm(model_option):\n",
    "    if model_option == \"Qwen3\":\n",
    "        llm = Ollama(model=\"qwen3\")\n",
    "    else:\n",
    "        llm = Ollama(model=\"deepseek-r1\")\n",
    "    return llm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = 'Qwen3'\n",
    "# model_name = 'DeepSeek-R1'\n",
    "llm  = load_llm(model_name)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Trace RAG calls "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core import Settings\n",
    "from llama_index.core.callbacks import CallbackManager\n",
    "from opik.integrations.llama_index import LlamaIndexCallbackHandler\n",
    "\n",
    "# A callback handler tp automatically log all LlamaIndex operations to Opik\n",
    "opik_callback_handler = LlamaIndexCallbackHandler()\n",
    "\n",
    "# Integrate handler into LlamaIndex's settings\n",
    "Settings.callback_manager = CallbackManager([opik_callback_handler])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "from opik import Opik\n",
    "\n",
    "client = Opik()\n",
    "dataset = client.get_or_create_dataset(name=\"Test dataset\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"./eval-data/test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input': 'What was the very first programming language Paul Graham used when he began learning to program on the IBM 1401?',\n",
       " 'expected_output': 'He used an early version of Fortran on the IBM 1401.',\n",
       " 'context': 'The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the card reader and press a button to load the program into memory and run it.'}"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# insert the data into the dataset\n",
    "\n",
    "qa_pairs = [\n",
    "    {\"input\": row[\"Question\"], \"expected_output\": row[\"Answer\"], \"context\": row[\"Context\"]} \n",
    "    for _, row in df.iterrows()\n",
    "]\n",
    "qa_pairs[0]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use insert if you're creating the dataset for the first time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# dataset.insert(qa_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OPIK: Started logging traces to the \"Default Project\" project at https://www.comet.com/opik/api/v1/session/redirect/projects/?trace_id=01960a26-2400-7d46-8307-f339aa10934c&path=aHR0cHM6Ly93d3cuY29tZXQuY29tL29waWsvYXBpLw==.\n"
     ]
    }
   ],
   "source": [
    "from llama_index.embeddings.fastembed import FastEmbedEmbedding\n",
    "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
    "\n",
    "\n",
    "Settings.llm = llm\n",
    "Settings.embed_model = FastEmbedEmbedding(model_name=\"nomic-ai/nomic-embed-text-v1\")\n",
    "\n",
    "documents = SimpleDirectoryReader(\"./eval-data/paul_graham\").load_data()\n",
    "index = VectorStoreIndex.from_documents(documents)\n",
    "\n",
    "query_engine = index.as_query_engine()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "from opik import track\n",
    "\n",
    "@track\n",
    "def my_llm_application(input: str) -> str:\n",
    "    response = query_engine.query(input)\n",
    "    return str(response)\n",
    "\n",
    "def evaluation_task(x):\n",
    "    return {\n",
    "        \"output\": my_llm_application(x['input'])\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "from opik.evaluation.metrics import (\n",
    "    Hallucination,\n",
    "    AnswerRelevance,\n",
    "    ContextPrecision,\n",
    "    ContextRecall\n",
    ")\n",
    "\n",
    "# Define the metrics\n",
    "hallucination_metric = Hallucination()\n",
    "answer_relevance_metric = AnswerRelevance()\n",
    "context_precision_metric = ContextPrecision()\n",
    "context_recall_metric = ContextRecall() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Evaluation:   0%|          | 0/5 [00:00<?, ?it/s]Retrying llama_index.llms.openai.base.OpenAI._chat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jr4pf4d3fy5sdn50h7p56rqm` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 11726, Requested 2376. Please try again in 1m21.024s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}.\n",
      "Retrying llama_index.llms.openai.base.OpenAI._chat in 1.5539399740503337 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jr4pf4d3fy5sdn50h7p56rqm` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 11616, Requested 2376. Please try again in 1m19.927s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}.\n",
      "OPIK: LLM provider rate limit error detected. We recommend reducing the amount of parallel requests by setting `task_threads` evaluation parameter to a smaller number\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "Evaluation:   0%|          | 0/5 [00:11<?, ?it/s]\n"
     ]
    },
    {
     "ename": "RateLimitError",
     "evalue": "Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jr4pf4d3fy5sdn50h7p56rqm` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 11451, Requested 2376. Please try again in 1m18.278s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mRateLimitError\u001b[39m                            Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[54]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mopik\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mevaluation\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m evaluate\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m evaluation = \u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m      4\u001b[39m \u001b[43m    \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m      5\u001b[39m \u001b[43m    \u001b[49m\u001b[43mtask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mevaluation_task\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m      6\u001b[39m \u001b[43m    \u001b[49m\u001b[43mexperiment_name\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m      7\u001b[39m \u001b[43m    \u001b[49m\u001b[43mscoring_metrics\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43mhallucination_metric\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43manswer_relevance_metric\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontext_precision_metric\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontext_recall_metric\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m      8\u001b[39m \u001b[43m    \u001b[49m\u001b[43mexperiment_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m      9\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmodel\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgpt-3.5-turbo\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m     10\u001b[39m \u001b[43m    \u001b[49m\u001b[43m}\u001b[49m\n\u001b[32m     11\u001b[39m \u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/opik/evaluation/evaluator.py:106\u001b[39m, in \u001b[36mevaluate\u001b[39m\u001b[34m(dataset, task, scoring_metrics, experiment_name, project_name, experiment_config, verbose, nb_samples, task_threads, prompt, prompts, scoring_key_mapping, dataset_item_ids)\u001b[39m\n\u001b[32m     96\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m asyncio_support.async_http_connections_expire_immediately():\n\u001b[32m     97\u001b[39m     evaluation_engine = engine.EvaluationEngine(\n\u001b[32m     98\u001b[39m         client=client,\n\u001b[32m     99\u001b[39m         project_name=project_name,\n\u001b[32m   (...)\u001b[39m\u001b[32m    104\u001b[39m         scoring_key_mapping=scoring_key_mapping,\n\u001b[32m    105\u001b[39m     )\n\u001b[32m--> \u001b[39m\u001b[32m106\u001b[39m     test_results = \u001b[43mevaluation_engine\u001b[49m\u001b[43m.\u001b[49m\u001b[43mevaluate_llm_tasks\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    107\u001b[39m \u001b[43m        \u001b[49m\u001b[43mdataset_\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    108\u001b[39m \u001b[43m        \u001b[49m\u001b[43mtask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    109\u001b[39m \u001b[43m        \u001b[49m\u001b[43mnb_samples\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnb_samples\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    110\u001b[39m \u001b[43m        \u001b[49m\u001b[43mdataset_item_ids\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdataset_item_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    111\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    113\u001b[39m total_time = time.time() - start_time\n\u001b[32m    115\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m verbose == \u001b[32m1\u001b[39m:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/opik/evaluation/engine/engine.py:177\u001b[39m, in \u001b[36mEvaluationEngine.evaluate_llm_tasks\u001b[39m\u001b[34m(self, dataset_, task, nb_samples, dataset_item_ids)\u001b[39m\n\u001b[32m    163\u001b[39m dataset_items = dataset_.__internal_api__get_items_as_dataclasses__(\n\u001b[32m    164\u001b[39m     nb_samples=nb_samples,\n\u001b[32m    165\u001b[39m     dataset_item_ids=dataset_item_ids,\n\u001b[32m    166\u001b[39m )\n\u001b[32m    168\u001b[39m evaluation_tasks: List[EvaluationTask] = [\n\u001b[32m    169\u001b[39m     functools.partial(\n\u001b[32m    170\u001b[39m         \u001b[38;5;28mself\u001b[39m._evaluate_llm_task,\n\u001b[32m   (...)\u001b[39m\u001b[32m    174\u001b[39m     \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m dataset_items\n\u001b[32m    175\u001b[39m ]\n\u001b[32m--> \u001b[39m\u001b[32m177\u001b[39m test_results = \u001b[43mevaluation_tasks_executor\u001b[49m\u001b[43m.\u001b[49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    178\u001b[39m \u001b[43m    \u001b[49m\u001b[43mevaluation_tasks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_workers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_verbose\u001b[49m\n\u001b[32m    179\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    181\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m test_results\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/opik/evaluation/engine/evaluation_tasks_executor.py:32\u001b[39m, in \u001b[36mexecute\u001b[39m\u001b[34m(evaluation_tasks, workers, verbose)\u001b[39m\n\u001b[32m     26\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m futures.ThreadPoolExecutor(max_workers=workers) \u001b[38;5;28;01mas\u001b[39;00m pool:\n\u001b[32m     27\u001b[39m     test_result_futures = [\n\u001b[32m     28\u001b[39m         pool.submit(evaluation_task) \u001b[38;5;28;01mfor\u001b[39;00m evaluation_task \u001b[38;5;129;01min\u001b[39;00m evaluation_tasks\n\u001b[32m     29\u001b[39m     ]\n\u001b[32m     31\u001b[39m     test_results = [\n\u001b[32m---> \u001b[39m\u001b[32m32\u001b[39m         \u001b[43mtest_result_future\u001b[49m\u001b[43m.\u001b[49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     33\u001b[39m         \u001b[38;5;28;01mfor\u001b[39;00m test_result_future \u001b[38;5;129;01min\u001b[39;00m tqdm.tqdm(\n\u001b[32m     34\u001b[39m             futures.as_completed(\n\u001b[32m     35\u001b[39m                 test_result_futures,\n\u001b[32m     36\u001b[39m             ),\n\u001b[32m     37\u001b[39m             disable=(verbose < \u001b[32m1\u001b[39m),\n\u001b[32m     38\u001b[39m             desc=\u001b[33m\"\u001b[39m\u001b[33mEvaluation\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m     39\u001b[39m             total=\u001b[38;5;28mlen\u001b[39m(test_result_futures),\n\u001b[32m     40\u001b[39m         )\n\u001b[32m     41\u001b[39m     ]\n\u001b[32m     43\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m test_results\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/.local/share/uv/python/cpython-3.12.9-macos-aarch64-none/lib/python3.12/concurrent/futures/_base.py:449\u001b[39m, in \u001b[36mFuture.result\u001b[39m\u001b[34m(self, timeout)\u001b[39m\n\u001b[32m    447\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n\u001b[32m    448\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._state == FINISHED:\n\u001b[32m--> \u001b[39m\u001b[32m449\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m__get_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    451\u001b[39m \u001b[38;5;28mself\u001b[39m._condition.wait(timeout)\n\u001b[32m    453\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/.local/share/uv/python/cpython-3.12.9-macos-aarch64-none/lib/python3.12/concurrent/futures/_base.py:401\u001b[39m, in \u001b[36mFuture.__get_result\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m    399\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._exception:\n\u001b[32m    400\u001b[39m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m401\u001b[39m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m._exception\n\u001b[32m    402\u001b[39m     \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m    403\u001b[39m         \u001b[38;5;66;03m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n\u001b[32m    404\u001b[39m         \u001b[38;5;28mself\u001b[39m = \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/.local/share/uv/python/cpython-3.12.9-macos-aarch64-none/lib/python3.12/concurrent/futures/thread.py:59\u001b[39m, in \u001b[36m_WorkItem.run\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m     56\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[32m     58\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m59\u001b[39m     result = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     60\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[32m     61\u001b[39m     \u001b[38;5;28mself\u001b[39m.future.set_exception(exc)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/opik/evaluation/engine/engine.py:126\u001b[39m, in \u001b[36mEvaluationEngine._evaluate_llm_task\u001b[39m\u001b[34m(self, item, task)\u001b[39m\n\u001b[32m    124\u001b[39m LOGGER.debug(\u001b[33m\"\u001b[39m\u001b[33mTask started, input: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m, item_content)\n\u001b[32m    125\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m126\u001b[39m     task_output_ = \u001b[43mtask\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem_content\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    127\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exception:\n\u001b[32m    128\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m exception_analyzer.is_llm_provider_rate_limit_error(exception):\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/opik/decorator/base_track_decorator.py:298\u001b[39m, in \u001b[36mBaseTrackDecorator._tracked_sync.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m    290\u001b[39m     LOGGER.debug(\n\u001b[32m    291\u001b[39m         logging_messages.EXCEPTION_RAISED_FROM_TRACKED_FUNCTION,\n\u001b[32m    292\u001b[39m         func.\u001b[34m__name__\u001b[39m,\n\u001b[32m   (...)\u001b[39m\u001b[32m    295\u001b[39m         exc_info=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m    296\u001b[39m     )\n\u001b[32m    297\u001b[39m     error_info = error_info_collector.collect(exception)\n\u001b[32m--> \u001b[39m\u001b[32m298\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m exception\n\u001b[32m    299\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m    300\u001b[39m     stream_or_stream_manager = \u001b[38;5;28mself\u001b[39m._streams_handler(\n\u001b[32m    301\u001b[39m         result,\n\u001b[32m    302\u001b[39m         track_options.capture_output,\n\u001b[32m    303\u001b[39m         track_options.generations_aggregator,\n\u001b[32m    304\u001b[39m     )\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/opik/decorator/base_track_decorator.py:288\u001b[39m, in \u001b[36mBaseTrackDecorator._tracked_sync.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m    286\u001b[39m error_info: Optional[ErrorInfoDict] = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m    287\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m288\u001b[39m     result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    289\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exception:\n\u001b[32m    290\u001b[39m     LOGGER.debug(\n\u001b[32m    291\u001b[39m         logging_messages.EXCEPTION_RAISED_FROM_TRACKED_FUNCTION,\n\u001b[32m    292\u001b[39m         func.\u001b[34m__name__\u001b[39m,\n\u001b[32m   (...)\u001b[39m\u001b[32m    295\u001b[39m         exc_info=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m    296\u001b[39m     )\n",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[52]\u001b[39m\u001b[32m, line 10\u001b[39m, in \u001b[36mevaluation_task\u001b[39m\u001b[34m(x)\u001b[39m\n\u001b[32m      8\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mevaluation_task\u001b[39m(x):\n\u001b[32m      9\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m         \u001b[33m\"\u001b[39m\u001b[33moutput\u001b[39m\u001b[33m\"\u001b[39m: \u001b[43mmy_llm_application\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43minput\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     11\u001b[39m     }\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/opik/decorator/base_track_decorator.py:298\u001b[39m, in \u001b[36mBaseTrackDecorator._tracked_sync.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m    290\u001b[39m     LOGGER.debug(\n\u001b[32m    291\u001b[39m         logging_messages.EXCEPTION_RAISED_FROM_TRACKED_FUNCTION,\n\u001b[32m    292\u001b[39m         func.\u001b[34m__name__\u001b[39m,\n\u001b[32m   (...)\u001b[39m\u001b[32m    295\u001b[39m         exc_info=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m    296\u001b[39m     )\n\u001b[32m    297\u001b[39m     error_info = error_info_collector.collect(exception)\n\u001b[32m--> \u001b[39m\u001b[32m298\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m exception\n\u001b[32m    299\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m    300\u001b[39m     stream_or_stream_manager = \u001b[38;5;28mself\u001b[39m._streams_handler(\n\u001b[32m    301\u001b[39m         result,\n\u001b[32m    302\u001b[39m         track_options.capture_output,\n\u001b[32m    303\u001b[39m         track_options.generations_aggregator,\n\u001b[32m    304\u001b[39m     )\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/opik/decorator/base_track_decorator.py:288\u001b[39m, in \u001b[36mBaseTrackDecorator._tracked_sync.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m    286\u001b[39m error_info: Optional[ErrorInfoDict] = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m    287\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m288\u001b[39m     result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    289\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exception:\n\u001b[32m    290\u001b[39m     LOGGER.debug(\n\u001b[32m    291\u001b[39m         logging_messages.EXCEPTION_RAISED_FROM_TRACKED_FUNCTION,\n\u001b[32m    292\u001b[39m         func.\u001b[34m__name__\u001b[39m,\n\u001b[32m   (...)\u001b[39m\u001b[32m    295\u001b[39m         exc_info=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m    296\u001b[39m     )\n",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[52]\u001b[39m\u001b[32m, line 5\u001b[39m, in \u001b[36mmy_llm_application\u001b[39m\u001b[34m(input)\u001b[39m\n\u001b[32m      3\u001b[39m \u001b[38;5;129m@track\u001b[39m\n\u001b[32m      4\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mmy_llm_application\u001b[39m(\u001b[38;5;28minput\u001b[39m: \u001b[38;5;28mstr\u001b[39m) -> \u001b[38;5;28mstr\u001b[39m:\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m     response = \u001b[43mquery_engine\u001b[49m\u001b[43m.\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m      6\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(response)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/instrumentation/dispatcher.py:322\u001b[39m, in \u001b[36mDispatcher.span.<locals>.wrapper\u001b[39m\u001b[34m(func, instance, args, kwargs)\u001b[39m\n\u001b[32m    319\u001b[39m             _logger.debug(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to reset active_span_id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m    321\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m322\u001b[39m     result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    323\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, asyncio.Future):\n\u001b[32m    324\u001b[39m         \u001b[38;5;66;03m# If the result is a Future, wrap it\u001b[39;00m\n\u001b[32m    325\u001b[39m         new_future = asyncio.ensure_future(result)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/base/base_query_engine.py:52\u001b[39m, in \u001b[36mBaseQueryEngine.query\u001b[39m\u001b[34m(self, str_or_query_bundle)\u001b[39m\n\u001b[32m     50\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(str_or_query_bundle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[32m     51\u001b[39m         str_or_query_bundle = QueryBundle(str_or_query_bundle)\n\u001b[32m---> \u001b[39m\u001b[32m52\u001b[39m     query_result = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstr_or_query_bundle\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     53\u001b[39m dispatcher.event(\n\u001b[32m     54\u001b[39m     QueryEndEvent(query=str_or_query_bundle, response=query_result)\n\u001b[32m     55\u001b[39m )\n\u001b[32m     56\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m query_result\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/instrumentation/dispatcher.py:322\u001b[39m, in \u001b[36mDispatcher.span.<locals>.wrapper\u001b[39m\u001b[34m(func, instance, args, kwargs)\u001b[39m\n\u001b[32m    319\u001b[39m             _logger.debug(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to reset active_span_id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m    321\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m322\u001b[39m     result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    323\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, asyncio.Future):\n\u001b[32m    324\u001b[39m         \u001b[38;5;66;03m# If the result is a Future, wrap it\u001b[39;00m\n\u001b[32m    325\u001b[39m         new_future = asyncio.ensure_future(result)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/query_engine/retriever_query_engine.py:179\u001b[39m, in \u001b[36mRetrieverQueryEngine._query\u001b[39m\u001b[34m(self, query_bundle)\u001b[39m\n\u001b[32m    175\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m.callback_manager.event(\n\u001b[32m    176\u001b[39m     CBEventType.QUERY, payload={EventPayload.QUERY_STR: query_bundle.query_str}\n\u001b[32m    177\u001b[39m ) \u001b[38;5;28;01mas\u001b[39;00m query_event:\n\u001b[32m    178\u001b[39m     nodes = \u001b[38;5;28mself\u001b[39m.retrieve(query_bundle)\n\u001b[32m--> \u001b[39m\u001b[32m179\u001b[39m     response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_response_synthesizer\u001b[49m\u001b[43m.\u001b[49m\u001b[43msynthesize\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    180\u001b[39m \u001b[43m        \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m=\u001b[49m\u001b[43mquery_bundle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    181\u001b[39m \u001b[43m        \u001b[49m\u001b[43mnodes\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnodes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    182\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    183\u001b[39m     query_event.on_end(payload={EventPayload.RESPONSE: response})\n\u001b[32m    185\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m response\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/instrumentation/dispatcher.py:322\u001b[39m, in \u001b[36mDispatcher.span.<locals>.wrapper\u001b[39m\u001b[34m(func, instance, args, kwargs)\u001b[39m\n\u001b[32m    319\u001b[39m             _logger.debug(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to reset active_span_id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m    321\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m322\u001b[39m     result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    323\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, asyncio.Future):\n\u001b[32m    324\u001b[39m         \u001b[38;5;66;03m# If the result is a Future, wrap it\u001b[39;00m\n\u001b[32m    325\u001b[39m         new_future = asyncio.ensure_future(result)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/response_synthesizers/base.py:241\u001b[39m, in \u001b[36mBaseSynthesizer.synthesize\u001b[39m\u001b[34m(self, query, nodes, additional_source_nodes, **response_kwargs)\u001b[39m\n\u001b[32m    235\u001b[39m     query = QueryBundle(query_str=query)\n\u001b[32m    237\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m._callback_manager.event(\n\u001b[32m    238\u001b[39m     CBEventType.SYNTHESIZE,\n\u001b[32m    239\u001b[39m     payload={EventPayload.QUERY_STR: query.query_str},\n\u001b[32m    240\u001b[39m ) \u001b[38;5;28;01mas\u001b[39;00m event:\n\u001b[32m--> \u001b[39m\u001b[32m241\u001b[39m     response_str = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mget_response\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    242\u001b[39m \u001b[43m        \u001b[49m\u001b[43mquery_str\u001b[49m\u001b[43m=\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m.\u001b[49m\u001b[43mquery_str\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    243\u001b[39m \u001b[43m        \u001b[49m\u001b[43mtext_chunks\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m    244\u001b[39m \u001b[43m            \u001b[49m\u001b[43mn\u001b[49m\u001b[43m.\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_content\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmetadata_mode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mMetadataMode\u001b[49m\u001b[43m.\u001b[49m\u001b[43mLLM\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mn\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mnodes\u001b[49m\n\u001b[32m    245\u001b[39m \u001b[43m        \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    246\u001b[39m \u001b[43m        \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mresponse_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    247\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    249\u001b[39m     additional_source_nodes = additional_source_nodes \u001b[38;5;129;01mor\u001b[39;00m []\n\u001b[32m    250\u001b[39m     source_nodes = \u001b[38;5;28mlist\u001b[39m(nodes) + \u001b[38;5;28mlist\u001b[39m(additional_source_nodes)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/instrumentation/dispatcher.py:322\u001b[39m, in \u001b[36mDispatcher.span.<locals>.wrapper\u001b[39m\u001b[34m(func, instance, args, kwargs)\u001b[39m\n\u001b[32m    319\u001b[39m             _logger.debug(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to reset active_span_id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m    321\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m322\u001b[39m     result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    323\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, asyncio.Future):\n\u001b[32m    324\u001b[39m         \u001b[38;5;66;03m# If the result is a Future, wrap it\u001b[39;00m\n\u001b[32m    325\u001b[39m         new_future = asyncio.ensure_future(result)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/response_synthesizers/compact_and_refine.py:43\u001b[39m, in \u001b[36mCompactAndRefine.get_response\u001b[39m\u001b[34m(self, query_str, text_chunks, prev_response, **response_kwargs)\u001b[39m\n\u001b[32m     39\u001b[39m \u001b[38;5;66;03m# use prompt helper to fix compact text_chunks under the prompt limitation\u001b[39;00m\n\u001b[32m     40\u001b[39m \u001b[38;5;66;03m# TODO: This is a temporary fix - reason it's temporary is that\u001b[39;00m\n\u001b[32m     41\u001b[39m \u001b[38;5;66;03m# the refine template does not account for size of previous answer.\u001b[39;00m\n\u001b[32m     42\u001b[39m new_texts = \u001b[38;5;28mself\u001b[39m._make_compact_text_chunks(query_str, text_chunks)\n\u001b[32m---> \u001b[39m\u001b[32m43\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_response\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m     44\u001b[39m \u001b[43m    \u001b[49m\u001b[43mquery_str\u001b[49m\u001b[43m=\u001b[49m\u001b[43mquery_str\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     45\u001b[39m \u001b[43m    \u001b[49m\u001b[43mtext_chunks\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnew_texts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     46\u001b[39m \u001b[43m    \u001b[49m\u001b[43mprev_response\u001b[49m\u001b[43m=\u001b[49m\u001b[43mprev_response\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     47\u001b[39m \u001b[43m    \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mresponse_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     48\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/instrumentation/dispatcher.py:322\u001b[39m, in \u001b[36mDispatcher.span.<locals>.wrapper\u001b[39m\u001b[34m(func, instance, args, kwargs)\u001b[39m\n\u001b[32m    319\u001b[39m             _logger.debug(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to reset active_span_id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m    321\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m322\u001b[39m     result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    323\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, asyncio.Future):\n\u001b[32m    324\u001b[39m         \u001b[38;5;66;03m# If the result is a Future, wrap it\u001b[39;00m\n\u001b[32m    325\u001b[39m         new_future = asyncio.ensure_future(result)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/response_synthesizers/refine.py:179\u001b[39m, in \u001b[36mRefine.get_response\u001b[39m\u001b[34m(self, query_str, text_chunks, prev_response, **response_kwargs)\u001b[39m\n\u001b[32m    175\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m text_chunk \u001b[38;5;129;01min\u001b[39;00m text_chunks:\n\u001b[32m    176\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m prev_response \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m    177\u001b[39m         \u001b[38;5;66;03m# if this is the first chunk, and text chunk already\u001b[39;00m\n\u001b[32m    178\u001b[39m         \u001b[38;5;66;03m# is an answer, then return it\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m179\u001b[39m         response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_give_response_single\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    180\u001b[39m \u001b[43m            \u001b[49m\u001b[43mquery_str\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext_chunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mresponse_kwargs\u001b[49m\n\u001b[32m    181\u001b[39m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    182\u001b[39m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m    183\u001b[39m         \u001b[38;5;66;03m# refine response if possible\u001b[39;00m\n\u001b[32m    184\u001b[39m         response = \u001b[38;5;28mself\u001b[39m._refine_response_single(\n\u001b[32m    185\u001b[39m             prev_response, query_str, text_chunk, **response_kwargs\n\u001b[32m    186\u001b[39m         )\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/response_synthesizers/refine.py:241\u001b[39m, in \u001b[36mRefine._give_response_single\u001b[39m\u001b[34m(self, query_str, text_chunk, **response_kwargs)\u001b[39m\n\u001b[32m    237\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m response \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._streaming:\n\u001b[32m    238\u001b[39m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m    239\u001b[39m         structured_response = cast(\n\u001b[32m    240\u001b[39m             StructuredRefineResponse,\n\u001b[32m--> \u001b[39m\u001b[32m241\u001b[39m             \u001b[43mprogram\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    242\u001b[39m \u001b[43m                \u001b[49m\u001b[43mcontext_str\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcur_text_chunk\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    243\u001b[39m \u001b[43m                \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mresponse_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    244\u001b[39m \u001b[43m            \u001b[49m\u001b[43m)\u001b[49m,\n\u001b[32m    245\u001b[39m         )\n\u001b[32m    246\u001b[39m         query_satisfied = structured_response.query_satisfied\n\u001b[32m    247\u001b[39m         \u001b[38;5;28;01mif\u001b[39;00m query_satisfied:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/instrumentation/dispatcher.py:322\u001b[39m, in \u001b[36mDispatcher.span.<locals>.wrapper\u001b[39m\u001b[34m(func, instance, args, kwargs)\u001b[39m\n\u001b[32m    319\u001b[39m             _logger.debug(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to reset active_span_id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m    321\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m322\u001b[39m     result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    323\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, asyncio.Future):\n\u001b[32m    324\u001b[39m         \u001b[38;5;66;03m# If the result is a Future, wrap it\u001b[39;00m\n\u001b[32m    325\u001b[39m         new_future = asyncio.ensure_future(result)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/response_synthesizers/refine.py:85\u001b[39m, in \u001b[36mDefaultRefineProgram.__call__\u001b[39m\u001b[34m(self, *args, **kwds)\u001b[39m\n\u001b[32m     83\u001b[39m         answer = answer.model_dump_json()\n\u001b[32m     84\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m85\u001b[39m     answer = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_llm\u001b[49m\u001b[43m.\u001b[49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m     86\u001b[39m \u001b[43m        \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_prompt\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     87\u001b[39m \u001b[43m        \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     88\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     89\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m StructuredRefineResponse(answer=answer, query_satisfied=\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/instrumentation/dispatcher.py:322\u001b[39m, in \u001b[36mDispatcher.span.<locals>.wrapper\u001b[39m\u001b[34m(func, instance, args, kwargs)\u001b[39m\n\u001b[32m    319\u001b[39m             _logger.debug(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to reset active_span_id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m    321\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m322\u001b[39m     result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    323\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, asyncio.Future):\n\u001b[32m    324\u001b[39m         \u001b[38;5;66;03m# If the result is a Future, wrap it\u001b[39;00m\n\u001b[32m    325\u001b[39m         new_future = asyncio.ensure_future(result)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/llms/llm.py:605\u001b[39m, in \u001b[36mLLM.predict\u001b[39m\u001b[34m(self, prompt, **prompt_args)\u001b[39m\n\u001b[32m    603\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.metadata.is_chat_model:\n\u001b[32m    604\u001b[39m     messages = \u001b[38;5;28mself\u001b[39m._get_messages(prompt, **prompt_args)\n\u001b[32m--> \u001b[39m\u001b[32m605\u001b[39m     chat_response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mchat\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    606\u001b[39m     output = chat_response.message.content \u001b[38;5;129;01mor\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m    607\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/instrumentation/dispatcher.py:322\u001b[39m, in \u001b[36mDispatcher.span.<locals>.wrapper\u001b[39m\u001b[34m(func, instance, args, kwargs)\u001b[39m\n\u001b[32m    319\u001b[39m             _logger.debug(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to reset active_span_id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m    321\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m322\u001b[39m     result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    323\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, asyncio.Future):\n\u001b[32m    324\u001b[39m         \u001b[38;5;66;03m# If the result is a Future, wrap it\u001b[39;00m\n\u001b[32m    325\u001b[39m         new_future = asyncio.ensure_future(result)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/llms/openai_like/base.py:154\u001b[39m, in \u001b[36mOpenAILike.chat\u001b[39m\u001b[34m(self, messages, **kwargs)\u001b[39m\n\u001b[32m    151\u001b[39m     completion_response = \u001b[38;5;28mself\u001b[39m.complete(prompt, formatted=\u001b[38;5;28;01mTrue\u001b[39;00m, **kwargs)\n\u001b[32m    152\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m completion_response_to_chat_response(completion_response)\n\u001b[32m--> \u001b[39m\u001b[32m154\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mchat\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/instrumentation/dispatcher.py:322\u001b[39m, in \u001b[36mDispatcher.span.<locals>.wrapper\u001b[39m\u001b[34m(func, instance, args, kwargs)\u001b[39m\n\u001b[32m    319\u001b[39m             _logger.debug(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to reset active_span_id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m    321\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m322\u001b[39m     result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    323\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, asyncio.Future):\n\u001b[32m    324\u001b[39m         \u001b[38;5;66;03m# If the result is a Future, wrap it\u001b[39;00m\n\u001b[32m    325\u001b[39m         new_future = asyncio.ensure_future(result)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/core/llms/callbacks.py:173\u001b[39m, in \u001b[36mllm_chat_callback.<locals>.wrap.<locals>.wrapped_llm_chat\u001b[39m\u001b[34m(_self, messages, **kwargs)\u001b[39m\n\u001b[32m    164\u001b[39m event_id = callback_manager.on_event_start(\n\u001b[32m    165\u001b[39m     CBEventType.LLM,\n\u001b[32m    166\u001b[39m     payload={\n\u001b[32m   (...)\u001b[39m\u001b[32m    170\u001b[39m     },\n\u001b[32m    171\u001b[39m )\n\u001b[32m    172\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m173\u001b[39m     f_return_val = \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_self\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    174\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m    175\u001b[39m     callback_manager.on_event_end(\n\u001b[32m    176\u001b[39m         CBEventType.LLM,\n\u001b[32m    177\u001b[39m         payload={EventPayload.EXCEPTION: e},\n\u001b[32m    178\u001b[39m         event_id=event_id,\n\u001b[32m    179\u001b[39m     )\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/llms/openai/base.py:383\u001b[39m, in \u001b[36mOpenAI.chat\u001b[39m\u001b[34m(self, messages, **kwargs)\u001b[39m\n\u001b[32m    381\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m    382\u001b[39m     chat_fn = completion_to_chat_decorator(\u001b[38;5;28mself\u001b[39m._complete)\n\u001b[32m--> \u001b[39m\u001b[32m383\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mchat_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/llms/openai/base.py:111\u001b[39m, in \u001b[36mllm_retry_decorator.<locals>.wrapper\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m    102\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m f(\u001b[38;5;28mself\u001b[39m, *args, **kwargs)\n\u001b[32m    104\u001b[39m retry = create_retry_decorator(\n\u001b[32m    105\u001b[39m     max_retries=max_retries,\n\u001b[32m    106\u001b[39m     random_exponential=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m   (...)\u001b[39m\u001b[32m    109\u001b[39m     max_seconds=\u001b[32m20\u001b[39m,\n\u001b[32m    110\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m111\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mretry\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/tenacity/__init__.py:338\u001b[39m, in \u001b[36mBaseRetrying.wraps.<locals>.wrapped_f\u001b[39m\u001b[34m(*args, **kw)\u001b[39m\n\u001b[32m    336\u001b[39m copy = \u001b[38;5;28mself\u001b[39m.copy()\n\u001b[32m    337\u001b[39m wrapped_f.statistics = copy.statistics  \u001b[38;5;66;03m# type: ignore[attr-defined]\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m338\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcopy\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/tenacity/__init__.py:477\u001b[39m, in \u001b[36mRetrying.__call__\u001b[39m\u001b[34m(self, fn, *args, **kwargs)\u001b[39m\n\u001b[32m    475\u001b[39m retry_state = RetryCallState(retry_object=\u001b[38;5;28mself\u001b[39m, fn=fn, args=args, kwargs=kwargs)\n\u001b[32m    476\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m477\u001b[39m     do = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43miter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mretry_state\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretry_state\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    478\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(do, DoAttempt):\n\u001b[32m    479\u001b[39m         \u001b[38;5;28;01mtry\u001b[39;00m:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/tenacity/__init__.py:378\u001b[39m, in \u001b[36mBaseRetrying.iter\u001b[39m\u001b[34m(self, retry_state)\u001b[39m\n\u001b[32m    376\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m    377\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m action \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.iter_state.actions:\n\u001b[32m--> \u001b[39m\u001b[32m378\u001b[39m     result = \u001b[43maction\u001b[49m\u001b[43m(\u001b[49m\u001b[43mretry_state\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    379\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/tenacity/__init__.py:420\u001b[39m, in \u001b[36mBaseRetrying._post_stop_check_actions.<locals>.exc_check\u001b[39m\u001b[34m(rs)\u001b[39m\n\u001b[32m    418\u001b[39m retry_exc = \u001b[38;5;28mself\u001b[39m.retry_error_cls(fut)\n\u001b[32m    419\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.reraise:\n\u001b[32m--> \u001b[39m\u001b[32m420\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[43mretry_exc\u001b[49m\u001b[43m.\u001b[49m\u001b[43mreraise\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    421\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m retry_exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mfut\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mexception\u001b[39;00m()\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/tenacity/__init__.py:187\u001b[39m, in \u001b[36mRetryError.reraise\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m    185\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mreraise\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> t.NoReturn:\n\u001b[32m    186\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.last_attempt.failed:\n\u001b[32m--> \u001b[39m\u001b[32m187\u001b[39m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mlast_attempt\u001b[49m\u001b[43m.\u001b[49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    188\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/.local/share/uv/python/cpython-3.12.9-macos-aarch64-none/lib/python3.12/concurrent/futures/_base.py:449\u001b[39m, in \u001b[36mFuture.result\u001b[39m\u001b[34m(self, timeout)\u001b[39m\n\u001b[32m    447\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n\u001b[32m    448\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._state == FINISHED:\n\u001b[32m--> \u001b[39m\u001b[32m449\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m__get_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    451\u001b[39m \u001b[38;5;28mself\u001b[39m._condition.wait(timeout)\n\u001b[32m    453\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/.local/share/uv/python/cpython-3.12.9-macos-aarch64-none/lib/python3.12/concurrent/futures/_base.py:401\u001b[39m, in \u001b[36mFuture.__get_result\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m    399\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._exception:\n\u001b[32m    400\u001b[39m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m401\u001b[39m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m._exception\n\u001b[32m    402\u001b[39m     \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m    403\u001b[39m         \u001b[38;5;66;03m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n\u001b[32m    404\u001b[39m         \u001b[38;5;28mself\u001b[39m = \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/tenacity/__init__.py:480\u001b[39m, in \u001b[36mRetrying.__call__\u001b[39m\u001b[34m(self, fn, *args, **kwargs)\u001b[39m\n\u001b[32m    478\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(do, DoAttempt):\n\u001b[32m    479\u001b[39m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m480\u001b[39m         result = \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    481\u001b[39m     \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m:  \u001b[38;5;66;03m# noqa: B902\u001b[39;00m\n\u001b[32m    482\u001b[39m         retry_state.set_exception(sys.exc_info())  \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/llama_index/llms/openai/base.py:479\u001b[39m, in \u001b[36mOpenAI._chat\u001b[39m\u001b[34m(self, messages, **kwargs)\u001b[39m\n\u001b[32m    473\u001b[39m message_dicts = to_openai_message_dicts(\n\u001b[32m    474\u001b[39m     messages,\n\u001b[32m    475\u001b[39m     model=\u001b[38;5;28mself\u001b[39m.model,\n\u001b[32m    476\u001b[39m )\n\u001b[32m    478\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.reuse_client:\n\u001b[32m--> \u001b[39m\u001b[32m479\u001b[39m     response = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mchat\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompletions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcreate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    480\u001b[39m \u001b[43m        \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmessage_dicts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    481\u001b[39m \u001b[43m        \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m    482\u001b[39m \u001b[43m        \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_get_model_kwargs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    483\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    484\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m    485\u001b[39m     \u001b[38;5;28;01mwith\u001b[39;00m client:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/openai/_utils/_utils.py:279\u001b[39m, in \u001b[36mrequired_args.<locals>.inner.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m    277\u001b[39m             msg = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMissing required argument: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mquote(missing[\u001b[32m0\u001b[39m])\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m    278\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(msg)\n\u001b[32m--> \u001b[39m\u001b[32m279\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py:914\u001b[39m, in \u001b[36mCompletions.create\u001b[39m\u001b[34m(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, web_search_options, extra_headers, extra_query, extra_body, timeout)\u001b[39m\n\u001b[32m    871\u001b[39m \u001b[38;5;129m@required_args\u001b[39m([\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mmodel\u001b[39m\u001b[33m\"\u001b[39m], [\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mmodel\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mstream\u001b[39m\u001b[33m\"\u001b[39m])\n\u001b[32m    872\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcreate\u001b[39m(\n\u001b[32m    873\u001b[39m     \u001b[38;5;28mself\u001b[39m,\n\u001b[32m   (...)\u001b[39m\u001b[32m    911\u001b[39m     timeout: \u001b[38;5;28mfloat\u001b[39m | httpx.Timeout | \u001b[38;5;28;01mNone\u001b[39;00m | NotGiven = NOT_GIVEN,\n\u001b[32m    912\u001b[39m ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n\u001b[32m    913\u001b[39m     validate_response_format(response_format)\n\u001b[32m--> \u001b[39m\u001b[32m914\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_post\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    915\u001b[39m \u001b[43m        \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m/chat/completions\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m    916\u001b[39m \u001b[43m        \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmaybe_transform\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    917\u001b[39m \u001b[43m            \u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m    918\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    919\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmodel\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    920\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43maudio\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43maudio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    921\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfrequency_penalty\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrequency_penalty\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    922\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfunction_call\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunction_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    923\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfunctions\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunctions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    924\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mlogit_bias\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogit_bias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    925\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mlogprobs\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogprobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    926\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmax_completion_tokens\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_completion_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    927\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmax_tokens\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    928\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmetadata\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    929\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmodalities\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodalities\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    930\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mn\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    931\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mparallel_tool_calls\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mparallel_tool_calls\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    932\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mprediction\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mprediction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    933\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mpresence_penalty\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mpresence_penalty\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    934\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mreasoning_effort\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mreasoning_effort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    935\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mresponse_format\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mresponse_format\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    936\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mseed\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mseed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    937\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mservice_tier\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mservice_tier\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    938\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstop\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    939\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstore\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstore\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    940\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstream\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    941\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstream_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    942\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtemperature\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemperature\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    943\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtool_choice\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtool_choice\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    944\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtools\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtools\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    945\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtop_logprobs\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_logprobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    946\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtop_p\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_p\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    947\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43muser\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    948\u001b[39m \u001b[43m                \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mweb_search_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mweb_search_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    949\u001b[39m \u001b[43m            \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    950\u001b[39m \u001b[43m            \u001b[49m\u001b[43mcompletion_create_params\u001b[49m\u001b[43m.\u001b[49m\u001b[43mCompletionCreateParams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    951\u001b[39m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    952\u001b[39m \u001b[43m        \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmake_request_options\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    953\u001b[39m \u001b[43m            \u001b[49m\u001b[43mextra_headers\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_headers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_query\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_query\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_body\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_body\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\n\u001b[32m    954\u001b[39m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    955\u001b[39m \u001b[43m        \u001b[49m\u001b[43mcast_to\u001b[49m\u001b[43m=\u001b[49m\u001b[43mChatCompletion\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    956\u001b[39m \u001b[43m        \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m    957\u001b[39m \u001b[43m        \u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m=\u001b[49m\u001b[43mStream\u001b[49m\u001b[43m[\u001b[49m\u001b[43mChatCompletionChunk\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    958\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/openai/_base_client.py:1242\u001b[39m, in \u001b[36mSyncAPIClient.post\u001b[39m\u001b[34m(self, path, cast_to, body, options, files, stream, stream_cls)\u001b[39m\n\u001b[32m   1228\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mpost\u001b[39m(\n\u001b[32m   1229\u001b[39m     \u001b[38;5;28mself\u001b[39m,\n\u001b[32m   1230\u001b[39m     path: \u001b[38;5;28mstr\u001b[39m,\n\u001b[32m   (...)\u001b[39m\u001b[32m   1237\u001b[39m     stream_cls: \u001b[38;5;28mtype\u001b[39m[_StreamT] | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m   1238\u001b[39m ) -> ResponseT | _StreamT:\n\u001b[32m   1239\u001b[39m     opts = FinalRequestOptions.construct(\n\u001b[32m   1240\u001b[39m         method=\u001b[33m\"\u001b[39m\u001b[33mpost\u001b[39m\u001b[33m\"\u001b[39m, url=path, json_data=body, files=to_httpx_files(files), **options\n\u001b[32m   1241\u001b[39m     )\n\u001b[32m-> \u001b[39m\u001b[32m1242\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m cast(ResponseT, \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcast_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m)\u001b[49m)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/openai/_base_client.py:919\u001b[39m, in \u001b[36mSyncAPIClient.request\u001b[39m\u001b[34m(self, cast_to, options, remaining_retries, stream, stream_cls)\u001b[39m\n\u001b[32m    916\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m    917\u001b[39m     retries_taken = \u001b[32m0\u001b[39m\n\u001b[32m--> \u001b[39m\u001b[32m919\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    920\u001b[39m \u001b[43m    \u001b[49m\u001b[43mcast_to\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcast_to\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    921\u001b[39m \u001b[43m    \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m=\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    922\u001b[39m \u001b[43m    \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    923\u001b[39m \u001b[43m    \u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    924\u001b[39m \u001b[43m    \u001b[49m\u001b[43mretries_taken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretries_taken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    925\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/Eigen/ai-engineering-hub/llama-4_vs_deepseek-r1/.venv/lib/python3.12/site-packages/openai/_base_client.py:1023\u001b[39m, in \u001b[36mSyncAPIClient._request\u001b[39m\u001b[34m(self, cast_to, options, retries_taken, stream, stream_cls)\u001b[39m\n\u001b[32m   1020\u001b[39m         err.response.read()\n\u001b[32m   1022\u001b[39m     log.debug(\u001b[33m\"\u001b[39m\u001b[33mRe-raising status error\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1023\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m._make_status_error_from_response(err.response) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m   1025\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._process_response(\n\u001b[32m   1026\u001b[39m     cast_to=cast_to,\n\u001b[32m   1027\u001b[39m     options=options,\n\u001b[32m   (...)\u001b[39m\u001b[32m   1031\u001b[39m     retries_taken=retries_taken,\n\u001b[32m   1032\u001b[39m )\n",
      "\u001b[31mRateLimitError\u001b[39m: Error code: 429 - {'error': {'message': 'Rate limit reached for model `meta-llama/llama-4-scout-17b-16e-instruct` in organization `org_01jr4pf4d3fy5sdn50h7p56rqm` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 11451, Requested 2376. Please try again in 1m18.278s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}"
     ]
    }
   ],
   "source": [
    "from opik.evaluation import evaluate\n",
    "\n",
    "evaluation = evaluate(\n",
    "    dataset=dataset,\n",
    "    task=evaluation_task,\n",
    "    experiment_name = model_name,\n",
    "    scoring_metrics=[hallucination_metric, answer_relevance_metric, context_precision_metric, context_recall_metric],\n",
    "    experiment_config={\n",
    "        \"model\": \"gpt-3.5-turbo\"\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
